# Dependent libraries
library(dplyr)
library(gtools)
library(gmodels)
library(ggplot2)
library(class)
library(tidyr)
library(ROCR)
library(magrittr)
library(broom)
library(plotROC)
library(readxl)
library(recipes)
library(caret)

# Load the heart-disease data from the Excel workbook
heart <- read_excel("Cleve (Numeric).xlsx")

# Inspect the column types as imported
str(heart)

# Drop the rows where `ca` or `thal` holds the "?" placeholder, then coerce
# each of the 14 listed columns to numeric in a single pass
heart <- heart %>%
  filter(ca != "?", thal != "?") %>%
  mutate(across(
    c(age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak,
      slope, ca, thal, target),
    as.numeric
  ))

# Confirm that the conversion produced numeric columns
str(heart)

# Randomly assign 75% of the rows (rounded down) to training and the rest to
# testing; fixing the seed makes the split reproducible from run to run
set.seed(100)
index <- sample(nrow(heart), size = 0.75 * nrow(heart))
train <- heart[index, ]
test <- heart[-index, ]

# Optionally save both partitions as Excel workbooks
library(writexl)
write_xlsx(train, path = "Train.xlsx")
write_xlsx(test, path = "Test.xlsx")

# Fit a logistic regression of `target` on every other column of `train`
# (`target ~ .`); the binomial family models a two-class outcome
model_lgr <- glm(target ~ ., family = "binomial", data = train)

# Print the fit summary and draw the model's diagnostic plots
summary(model_lgr)
plot(model_lgr)

# Collect the coefficient table from the summary as a data frame
output_values <- summary(model_lgr)$coefficients %>% data.frame()
output_values
# Draw a bar chart of a fitted model's coefficients.
#
# One bar is drawn per coefficient. The default x axis is suppressed and the
# coefficient names are written beneath the bars, rotated by 45 degrees.
#
# mlr_model: a fitted model object that `coefficients()` can be applied to.
# Called for its plotting side effect.
plot_coeffs <- function(mlr_model) {
  estimates <- coefficients(mlr_model)
  bar_centres <- barplot(
    estimates,
    col = "#3F97D0",
    xaxt = "n",
    main = "Regression Coefficients"
  )
  # par("usr")[3] is the lower y limit of the plot region; xpd = TRUE lets the
  # rotated labels be drawn outside that region
  text(
    x = bar_centres,
    y = par("usr")[3],
    labels = names(estimates),
    srt = 45,
    adj = c(1.1, 1.1),
    xpd = TRUE,
    cex = 0.6
  )
}

# Predicted probabilities for the held-out test rows
predicted <- predict(model_lgr, newdata = test, type = "response")
plot_coeffs(model_lgr)

# caret (attached earlier) and InformationValue both export confusionMatrix(),
# sensitivity() and specificity(), and plotROC is also the name of a package
# attached earlier. The calls below are namespace-qualified so that they no
# longer depend on the order in which the packages were attached.
library(InformationValue)

# Probability cutoff chosen by optimalCutoff() with its default settings.
# NOTE(review): the cutoff is tuned on the same test rows that the metrics
# below are computed on, so those metrics are likely optimistic - consider
# choosing the cutoff on the training data instead.
optCutOff <- InformationValue::optimalCutoff(test$target, predicted)[1]

# Evaluation metrics at the chosen cutoff
InformationValue::misClassError(test$target, predicted, threshold = optCutOff)
InformationValue::Concordance(test$target, predicted)
InformationValue::sensitivity(test$target, predicted, threshold = optCutOff)
InformationValue::specificity(test$target, predicted, threshold = optCutOff)
InformationValue::confusionMatrix(test$target, predicted, threshold = optCutOff)
InformationValue::plotROC(test$target, predicted)

# Concordance: a concordant pair is formed by a heart disease case and a
# normal case where the predicted probability of the heart disease case is
# higher than the predicted probability of the normal case.

# Discordance: a discordant pair is one where the predicted probability of the
# heart disease case is lower than the predicted probability of the normal case.

# Sensitivity & Specificity: a model with higher sensitivity and specificity
# is regarded as a better model.
# Sensitivity = True Positive Rate = TP/(TP+FN) = 29/(29+5)
# Specificity = True Negative Rate = TN/(TN+FP) = 41/(41+0)
# Mis-classification = (FP+FN)/(FP+FN+TN+TP) = (0+5)/(0+5+41+29)

# The ROC curve assesses how well the model separates heart disease cases
# from normal cases.